IMPORTING LIBRARIES


DATA READING

In [1]:
### importing required Libraries

import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold

import os

import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from itertools import combinations
In [ ]:
!pip install catboost
!pip install shap
import shap

from catboost import CatBoostClassifier, Pool
In [3]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/jobathon')
In [4]:
## Reading Train Data

train=pd.read_csv('train_s3TEQDk.csv')



## Reading Test Data

test=pd.read_csv('test_mSzZ8RL.csv')

BASIC DATAFRAME CHECK

DATA TYPECASTING,MISSING-VALUES

In [5]:
def check_df(dataframe):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
In [6]:
check_df(train)
##################### Shape #####################
(245725, 11)
##################### Types #####################
ID                     object
Gender                 object
Age                     int64
Region_Code            object
Occupation             object
Channel_Code           object
Vintage                 int64
Credit_Product         object
Avg_Account_Balance     int64
Is_Active              object
Is_Lead                 int64
dtype: object
##################### Head #####################
         ID  Gender  Age Region_Code     Occupation Channel_Code  Vintage  \
0  NNVBBKZB  Female   73       RG268          Other           X3       43   
1  IDD62UNG  Female   30       RG277       Salaried           X1       32   
2  HD3DSEMC  Female   56       RG268  Self_Employed           X3       26   

  Credit_Product  Avg_Account_Balance Is_Active  Is_Lead  
0             No              1045696        No        0  
1             No               581988        No        0  
2             No              1484315       Yes        0  
##################### Tail #####################
              ID  Gender  Age Region_Code Occupation Channel_Code  Vintage  \
245722  GEHAUCWT  Female   26       RG281   Salaried           X1       13   
245723  GE7V8SAH  Female   28       RG273   Salaried           X1       31   
245724  BOCZSWLJ    Male   29       RG269   Salaried           X1       21   

       Credit_Product  Avg_Account_Balance Is_Active  Is_Lead  
245722             No               670659        No        0  
245723             No               407504        No        0  
245724             No              1129276        No        0  
##################### NA #####################
ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64
##################### Quantiles #####################
                        0.00      0.05      0.50       0.95       0.99  \
Age                     23.0      26.0      43.0       73.0       82.0   
Vintage                  7.0      13.0      32.0      110.0      123.0   
Avg_Account_Balance  20790.0  352992.8  894601.0  2673533.0  4472872.6   
Is_Lead                  0.0       0.0       0.0        1.0        1.0   

                           1.00  
Age                        85.0  
Vintage                   135.0  
Avg_Account_Balance  10352009.0  
Is_Lead                     1.0  
In [7]:
check_df(test)
##################### Shape #####################
(105312, 10)
##################### Types #####################
ID                     object
Gender                 object
Age                     int64
Region_Code            object
Occupation             object
Channel_Code           object
Vintage                 int64
Credit_Product         object
Avg_Account_Balance     int64
Is_Active              object
dtype: object
##################### Head #####################
         ID Gender  Age Region_Code Occupation Channel_Code  Vintage  \
0  VBENBARO   Male   29       RG254      Other           X1       25   
1  CCMEWNKY   Male   43       RG268      Other           X2       49   
2  VK3KGA9M   Male   31       RG270   Salaried           X1       14   

  Credit_Product  Avg_Account_Balance Is_Active  
0            Yes               742366        No  
1            NaN               925537        No  
2             No               215949        No  
##################### Tail #####################
              ID Gender  Age Region_Code Occupation Channel_Code  Vintage  \
105309  HDESC8GU   Male   35       RG254   Salaried           X4       15   
105310  2PW4SFCA   Male   53       RG254      Other           X3       93   
105311  F2NOYPPZ   Male   27       RG256   Salaried           X1       21   

       Credit_Product  Avg_Account_Balance Is_Active  
105309             No              1703727        No  
105310             No               737178       Yes  
105311             No               591565        No  
##################### NA #####################
ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64
##################### Quantiles #####################
                        0.00      0.05      0.50        0.95        0.99  \
Age                     24.0      26.0      43.0       73.00       82.00   
Vintage                  7.0      13.0      32.0      109.00      123.00   
Avg_Account_Balance  22597.0  356062.0  896634.5  2699587.25  4601080.17   

                          1.00  
Age                       85.0  
Vintage                  135.0  
Avg_Account_Balance  9908858.0  
In [8]:
# removing nan which account for just above 10% base
train['Credit_Product']=train['Credit_Product'].fillna('NA')
test['Credit_Product']=test['Credit_Product'].fillna('NA')

print(train.isna().sum())
print(test.isna().sum())
ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
Is_Lead                0
dtype: int64
ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
dtype: int64
In [9]:
## typecasting train and test features

category_cols=['Gender','Region_Code','Channel_Code','Occupation','Credit_Product','Is_Active']
numerical_cols=['Age','Vintage','Avg_Account_Balance']
target_cols=['Is_Lead']
In [10]:
for col in category_cols:
  train[col]=train[col].astype('category')
  test[col]=test[col].astype('category')
  train[target_cols]=train[target_cols].astype('category')
In [11]:
for col in numerical_cols:
    train[col]=train[col].astype('float')
    test[col]=test[col].astype('float')
In [12]:
print(train.dtypes)
print(test.dtypes)
ID                       object
Gender                 category
Age                     float64
Region_Code            category
Occupation             category
Channel_Code           category
Vintage                 float64
Credit_Product         category
Avg_Account_Balance     float64
Is_Active              category
Is_Lead                category
dtype: object
ID                       object
Gender                 category
Age                     float64
Region_Code            category
Occupation             category
Channel_Code           category
Vintage                 float64
Credit_Product         category
Avg_Account_Balance     float64
Is_Active              category
dtype: object
In [13]:
### Age and Vintage Distribution
print(train['Age'].min(),train['Age'].max())
print(test['Age'].min(),test['Age'].max())
print(train['Vintage'].min(),train['Vintage'].max())
print(test['Vintage'].min(),test['Vintage'].max())
23.0 85.0
24.0 85.0
7.0 135.0
7.0 135.0
In [14]:
print(train['Region_Code'].nunique(),test['Region_Code'].nunique())
35 35
In [15]:
train['Region_Code'].value_counts()
Out[15]:
RG268    35934
RG283    29416
RG254    26840
RG284    19320
RG277    12826
RG280    12775
RG269     7863
RG270     7720
RG261     7633
RG257     6101
RG251     5950
RG282     5829
RG274     5286
RG272     5252
RG281     5093
RG273     4497
RG252     4286
RG279     3976
RG263     3687
RG275     3245
RG260     3110
RG256     2847
RG264     2793
RG276     2764
RG259     2586
RG250     2496
RG255     2018
RG258     1951
RG253     1858
RG278     1822
RG262     1788
RG266     1578
RG265     1546
RG271     1542
RG267     1497
Name: Region_Code, dtype: int64
In [16]:
## function to club all categories with levels below 4000 to O
def make_etc(x):
    if len(train[train['Region_Code']==x]) >= 5000:
        return "Top"
    else:
        return x

# Replace with 'etc' if category count is less than 4000
#train['Region_Code'] = train['Region_Code'].apply(make_etc)
#test['Region_Code'] = test['Region_Code'].apply(make_etc)
In [17]:
### Target Value distribution

print(train[target_cols].value_counts(normalize=True)*100)

## No Class imbalance in train data
Is_Lead
0          76.279174
1          23.720826
dtype: float64

EXPLORATORY DATA ANALYSIS


CONTINOUS VARIABLE DISTRIBUTION

PLOTS TO CHECK TARGET-Is_Lead AND OTHER CATEGORIES DISTRIBUTION

In [18]:
sns.displot((train['Avg_Account_Balance']))
sns.displot(test['Avg_Account_Balance'])

## Account Balance is right skewed in both train and test
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x7f4e487c3210>
In [19]:
# Age Variable Distribution
sns.displot((train['Age']))
sns.displot((test['Age']))
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x7f4e3ec2b410>
In [20]:
## Distribution of Vintage - converting to years
sns.displot((train['Vintage']/12))
sns.displot((test['Vintage']/12))
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x7f4e48d1ce10>
In [21]:
#Gender vs Target Variable CountPlot
sns.countplot(train['Gender'],hue=train['Is_Lead'])
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4e3e4d9310>
In [22]:
## Is_Active vs Target Distribution
sns.countplot(train['Is_Active'],hue=train['Is_Lead'])
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4e3e285ad0>
In [23]:
## Occupation vs Target
sns.countplot(train['Occupation'],hue=train['Is_Lead'])
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4e3e1e2390>
In [24]:
## Checking unique Region_Code, Channel_Code
train['Region_Code'].nunique(),train['Channel_Code'].nunique()
Out[24]:
(35, 4)
In [25]:
sns.countplot(train['Channel_Code'],hue=train['Is_Lead'])
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4e3e16ed90>
In [26]:
train['Avg_Account_Balance']=(train['Avg_Account_Balance']-train['Avg_Account_Balance'].min())/train['Avg_Account_Balance'].max()
test['Avg_Account_Balance']=(test['Avg_Account_Balance']-test['Avg_Account_Balance'].min())/test['Avg_Account_Balance'].max()
In [27]:
test['Age']=(test['Age']-test['Age'].min())/test['Age'].max()
train['Age']=(train['Age']-train['Age'].min())/train['Age'].max()
In [28]:
test['Vintage']=(test['Vintage']-test['Vintage'].min())/test['Vintage'].max()
train['Vintage']=(train['Vintage']-train['Vintage'].min())/train['Vintage'].max()
In [29]:
print(train['Vintage'].describe())
print(test['Vintage'].describe())
count    245725.000000
mean          0.295994
std           0.239653
min           0.000000
25%           0.096296
50%           0.185185
75%           0.488889
max           0.948148
Name: Vintage, dtype: float64
count    105312.000000
mean          0.295132
std           0.239064
min           0.000000
25%           0.096296
50%           0.185185
75%           0.488889
max           0.948148
Name: Vintage, dtype: float64
In [30]:
train['Age'].describe()
Out[30]:
count    245725.000000
mean          0.245368
std           0.174455
min           0.000000
25%           0.082353
50%           0.235294
75%           0.364706
max           0.729412
Name: Age, dtype: float64
In [31]:
## Converting Vintage from months to years
test['Vintage']=((test['Vintage']/12))
train['Vintage']=((train['Vintage']/12))

COMBINING TRAIN AND TEST


LABEL ENCODING CATEGORIES

In [32]:
train['train_or_test']=1
test['train_or_test']=0
df=pd.concat([train,test])
category_cols=['Gender','Region_Code','Channel_Code','Occupation','Credit_Product','Is_Active',]
numerical_cols=['Age','Vintage','Avg_Account_Balance']
target_cols=['Is_Lead']   
    
le = LabelEncoder()
for col in category_cols:
        df[col]=  df[col].astype('str')
        df[col]= le.fit_transform(df[col])
In [33]:
df.shape
Out[33]:
(351037, 12)

FEATURE ENGINEERING


FREQUENCY ENCODING,CREATING INTERACTION FEATURES AND EXTRACTING FEATURES FROM AGGREGATE MEASURES

In [34]:
def frequency_encoding(column_name,output_column_name,df):
    fe_pol = (df.groupby(column_name).size()) / len(df)
    df[output_column_name] = df[column_name].apply(lambda x : fe_pol[x])
    
def feature_engineering(df):
    cat_features=[]
    columns=['Gender','Region_Code','Channel_Code','Occupation','Credit_Product','Is_Active']
    comb = combinations(columns, 2) 
    for i in list(comb):  
        df[f'{i[0]}_{i[1]}']=df[i[0]].astype(str)+'_'+df[i[1]].astype(str)
        frequency_encoding(f'{i[0]}_{i[1]}',f'{i[0]}_{i[1]}',df)
        cat_features.append(f'{i[0]}_{i[1]}')

  #Frequency Encoding    
    frequency_encoding('Region_Code','Region_Code_fe',df)
    frequency_encoding('Channel_Code','Channel_Code_fe',df)
    frequency_encoding('Occupation','Occupation_fe',df)

     #Deriving characteristics of each Region by creating aggregate features
    Region_aggregate_features = df.groupby(['Region_Code']).agg({'Age': ['mean', 'max', 'min','std','sum'], 
                                                               'Vintage': ['mean', 'max', 'min','std','sum'], 
                                                     'Avg_Account_Balance': ['mean', 'max', 'min','std','sum'], 
                                                     'Channel_Code': ['nunique','count'], 
                                                     'Gender': ['nunique','count'],
                                                     'Credit_Product': ['nunique','count'] ,
                                                      'Occupation': ['nunique','count'] ,
                                                     'Is_Active' :['nunique','count'] 
                                                     })
    Region_aggregate_features.columns = ['Region_aggregate_features' + '_'.join(c).strip('_') for c in Region_aggregate_features.columns]
    df = pd.merge(df, Region_aggregate_features, on = ['Region_Code'], how='left')

    region_channel_aggregate_features = df.groupby(['Region_Code','Channel_Code']).agg({'Age': ['mean', 'max', 'min','std','sum'], 
                                                               'Vintage': ['mean', 'max', 'min','std','sum'], 
                                                     'Avg_Account_Balance': ['mean', 'max', 'min','std','sum'], 
                                                     'Channel_Code': ['nunique','count'], 
                                                     'Gender': ['nunique','count'],
                                                     'Credit_Product': ['nunique','count'] ,
                                                     'Occupation': ['nunique','count'] ,
                                                     'Is_Active' :['nunique','count'] 
                                                     })
    region_channel_aggregate_features.columns = ['region_channel_aggregate_features' + '_'.join(c).strip('_') for c in region_channel_aggregate_features.columns]
    df = pd.merge(df,region_channel_aggregate_features, on = ['Region_Code','Channel_Code'], how='left')

    region_occupation_aggregate_features = df.groupby(['Region_Code','Occupation']).agg({'Age': ['mean', 'max', 'min','std','sum'], 
                                                               'Vintage': ['mean', 'max', 'min','std','sum'], 
                                                     'Avg_Account_Balance': ['mean', 'max', 'min','std','sum'], 
                                                     'Channel_Code': ['nunique','count'], 
                                                     'Gender': ['nunique','count'],
                                                     'Credit_Product': ['nunique','count'] ,
                                                     'Occupation': ['nunique','count'] ,
                                                     'Is_Active' :['nunique','count'] 
                                                     })
    region_occupation_aggregate_features.columns = ['region_occupation_aggregate_features' + '_'.join(c).strip('_') for c in region_channel_aggregate_features.columns]
    df = pd.merge(df,region_occupation_aggregate_features, on = ['Region_Code','Occupation'], how='left')

    #age_aggregate_features=df.groupby(['Age']).agg({
                                                     #'Region_Code': ['nunique','count'],
                                                      #'Vintage': ['mean', 'max', 'min','std','sum'],  
                                                     #'Avg_Account_Balance': ['mean', 'max', 'min','std','sum'], 
                                                     #'Channel_Code': ['nunique','count'], 
                                                     #'Gender': ['nunique','count'],
                                                     #'Credit_Product': ['nunique','count'] ,
                                                     #'Occupation': ['nunique','count'] ,
                                                     #'Is_Active' :['nunique','count'] 
                                                    # })
    #age_aggregate_features.columns = ['age_aggregate_features' + '_'.join(c).strip('_') for c in age_aggregate_features.columns]
    #df = pd.merge(df, age_aggregate_features, on = ['Age'], how='left')

    
    #Vintage_aggregate_features=df.groupby(['Vintage']).agg({'Age': ['mean', 'max', 'min','std','sum'], 
                                                      #'Region_Code': ['nunique','count'],
                                                     #'Avg_Account_Balance': ['mean', 'max', 'min','std','sum'], 
                                                     #'Channel_Code': ['nunique','count'], 
                                                     #'Gender': ['nunique','count'],
                                                     #'Credit_Product': ['nunique','count'] ,
                                                     #'Occupation': ['nunique','count'] ,
                                                     #'Is_Active' :['nunique','count'] 
                                                     #})
    #Vintage_aggregate_features.columns = ['Vintage_aggregate_features' + '_'.join(c).strip('_') for c in Vintage_aggregate_features.columns]
    #df = pd.merge(df, Vintage_aggregate_features, on = ['Vintage'], how='left')

    Region_CodeChannel_Code_grpd = df.groupby(['Region_Code_Channel_Code']).agg({ 
                                                                                 'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})                                                              
                                                     
    Region_CodeChannel_Code_grpd.columns = ['grpd_by_Region_Code_Channel_Code' + '_'.join(c).strip('_') for c in Region_CodeChannel_Code_grpd.columns]
    df = pd.merge(df, Region_CodeChannel_Code_grpd, on = ['Region_Code_Channel_Code'], how='left')


    Channel_CodeOccupation_grpd = df.groupby(['Channel_Code_Occupation']).agg({  
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Channel_CodeOccupation_grpd.columns = ['grpd_by_Channel_Code_Occupation' + '_'.join(c).strip('_') for c in Channel_CodeOccupation_grpd.columns]
    df = pd.merge(df, Channel_CodeOccupation_grpd, on = ['Channel_Code_Occupation'], how='left')

    Occupation_Credit_grpd = df.groupby(['Occupation_Credit_Product']).agg({ 
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Occupation_Credit_grpd.columns = ['grpd_by_Credit_Occupation' + '_'.join(c).strip('_') for c in Occupation_Credit_grpd.columns]
    df = pd.merge(df, Occupation_Credit_grpd, on = ['Occupation_Credit_Product'], how='left')
    
    Occupation_Active_grpd = df.groupby(['Occupation_Is_Active']).agg({ 
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Occupation_Active_grpd.columns = ['grpd_by_Active_Occupation' + '_'.join(c).strip('_') for c in Occupation_Active_grpd.columns]
    df = pd.merge(df, Occupation_Active_grpd, on = ['Occupation_Is_Active'], how='left')

    Credit_Active_grpd = df.groupby(['Credit_Product_Is_Active']).agg({ 
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Credit_Active_grpd.columns = ['grpd_by_Active_Credit' + '_'.join(c).strip('_') for c in Credit_Active_grpd.columns]
    df = pd.merge(df, Credit_Active_grpd, on = ['Credit_Product_Is_Active'], how='left')

    Region_Active_grpd = df.groupby(['Region_Code_Is_Active']).agg({  
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Region_Active_grpd.columns = ['grpd_by_Active_Region' + '_'.join(c).strip('_') for c in Region_Active_grpd.columns]
    df = pd.merge(df, Region_Active_grpd, on = ['Region_Code_Is_Active'], how='left')

    Channel_Active_grpd = df.groupby(['Channel_Code_Is_Active']).agg({ 
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Channel_Active_grpd.columns = ['grpd_by_Active_Channel' + '_'.join(c).strip('_') for c in Channel_Active_grpd.columns]
    df = pd.merge(df, Channel_Active_grpd, on = ['Channel_Code_Is_Active'], how='left')

    Region_Credit_grpd = df.groupby(['Region_Code_Credit_Product']).agg({ 
                                                                             'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Region_Credit_grpd.columns = ['grpd_by_Region_Credit' + '_'.join(c).strip('_') for c in Region_Credit_grpd.columns]
    df = pd.merge(df, Region_Credit_grpd, on = ['Region_Code_Credit_Product'], how='left')

    Channel_Credit_grpd = df.groupby(['Channel_Code_Credit_Product']).agg({ 
                                                                            'Avg_Account_Balance': ['mean', 'max', 'min', 'std']})    

    Channel_Credit_grpd.columns = ['grpd_by_Channel_Credit' + '_'.join(c).strip('_') for c in Channel_Credit_grpd.columns]
    df = pd.merge(df, Channel_Credit_grpd, on = ['Channel_Code_Credit_Product'], how='left')




    return df,cat_features
In [35]:
df,cat_features=feature_engineering(df)
In [36]:
cat_cols=category_cols+cat_features
cat_cols
Out[36]:
['Gender',
 'Region_Code',
 'Channel_Code',
 'Occupation',
 'Credit_Product',
 'Is_Active',
 'Gender_Region_Code',
 'Gender_Channel_Code',
 'Gender_Occupation',
 'Gender_Credit_Product',
 'Gender_Is_Active',
 'Region_Code_Channel_Code',
 'Region_Code_Occupation',
 'Region_Code_Credit_Product',
 'Region_Code_Is_Active',
 'Channel_Code_Occupation',
 'Channel_Code_Credit_Product',
 'Channel_Code_Is_Active',
 'Occupation_Credit_Product',
 'Occupation_Is_Active',
 'Credit_Product_Is_Active']
In [37]:
for col in cat_cols:
  df[col]=df[col].astype('int')
In [38]:
df[cat_cols].dtypes
Out[38]:
Gender                         int64
Region_Code                    int64
Channel_Code                   int64
Occupation                     int64
Credit_Product                 int64
Is_Active                      int64
Gender_Region_Code             int64
Gender_Channel_Code            int64
Gender_Occupation              int64
Gender_Credit_Product          int64
Gender_Is_Active               int64
Region_Code_Channel_Code       int64
Region_Code_Occupation         int64
Region_Code_Credit_Product     int64
Region_Code_Is_Active          int64
Channel_Code_Occupation        int64
Channel_Code_Credit_Product    int64
Channel_Code_Is_Active         int64
Occupation_Credit_Product      int64
Occupation_Is_Active           int64
Credit_Product_Is_Active       int64
dtype: object
In [39]:
df.shape
Out[39]:
(351037, 141)

TRAIN_TEST SPLIT,DROPPING COLUMNS,TOTAL OF 138 FEATURES


MODEL BUILDING

In [40]:
train=df.loc[df.train_or_test.isin([1])]
test=df.loc[df.train_or_test.isin([0])]
    
drop_columns={'ID','Is_Lead','train_or_test'}
    
target=['Is_Lead']
    
x=train.drop(columns=drop_columns,axis=1)
y=train[target].astype('int')
x_test=test.drop(columns=drop_columns,axis=1)
print(x.shape)
(245725, 138)

MODEL-1


LGBM WITH STRATIFIED KFOLD

In [41]:
params= params=([('colsample_bytree', 0.3706219857878677),
             ('learning_rate', 0.018102685623591585),
             ('max_bin', 941),
             ('max_depth', 2),
             ('Is_unbalance',True),
             ('min_child_samples', 22),
             ('min_child_weight', 4),
             ('n_estimators', 12041),
             ('num_leaves', 17),
             ('reg_alpha', 1.081049236893711e-05),
             ('reg_lambda', 1.043686239159047),
             ('scale_pos_weight', 0.19222548462579486),
             ('subsample', 0.60),
             ('subsample_for_bin', 375140),
             ('category_features', cat_cols),
             ('subsample_freq', 7)])


  err = [] 

  oofs = np.zeros(shape=(len(x)))
  preds = np.zeros(shape=(len(x_test)))

  Folds=8

  fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2021)
  i = 1

  for train_index, test_index in fold.split(x, y):
      x_train, x_val = x.iloc[train_index], x.iloc[test_index]
      y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
      m = lgb.LGBMClassifier(params=params,verbose= -1)
    
      m.fit(x_train, y_train,eval_set=[(x_val, y_val)], early_stopping_rounds=50,verbose=False,eval_metric='auc',)
    
      pred_y = m.predict_proba(x_val)[:,1]
      oofs[test_index] = pred_y
      print(i, " err_lgm: ", roc_auc_score(y_val,pred_y))
      err.append(roc_auc_score(y_val,pred_y))
      preds+= m.predict_proba(x_test)[:,1]
      i = i + 1
  preds=preds/Folds
    
  print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
  oof_score = roc_auc_score(y, oofs)
  print(f'\nOOF Auc is : {oof_score}')
    
  oofs=pd.DataFrame(oofs,columns=['lgbmoof'])
  preds0=pd.DataFrame(preds,columns=['lgbmpred'])
1  err_lgm:  0.8737031589883674
2  err_lgm:  0.8681451392288885
3  err_lgm:  0.8796261933473757
4  err_lgm:  0.8760008231456465
5  err_lgm:  0.8749044349695608
6  err_lgm:  0.8707602982510736
7  err_lgm:  0.873063488010986
8  err_lgm:  0.8766706858727967
Average StratifiedKFold Score : 0.874109277726837 

OOF Auc is : 0.8740167930297762

SUBMISSIONS

LGBM

In [42]:
submission=pd.read_csv('sample_submission_eyYijxG.csv')
In [43]:
submission['Is_Lead']=preds0['lgbmpred']
submission.to_csv('lgbm_preds_30May_final.csv',index=False)

XGBOOST WITH STRATIFIED K FOLD


MODEL-2

In [44]:
params=([('colsample_bylevel', 0.3706219857878677),
             ('colsample_bytree', 0.6142670193823258),
             ('gamma', 0.1331227203252178),
             ('Is_unbalance', True),
             ('objective', 'binary:logistic'),
             ('tree_method', 'approx'),
             ('learning_rate', 0.03450570695385555),
             ('max_delta_step', 2),
             ('max_depth', 5),
             ('min_child_weight', 4),
             ('n_estimators', 97),
             ('reg_alpha', 1.081049236893711e-05),
             ('reg_lambda', 1.043686239159047),
             ('category_features', cat_cols),
             ('scale_pos_weight', 0.19222548462579486)])
            
             
err = [] 

oofs = np.zeros(shape=(len(x)))
preds = np.zeros(shape=(len(x_test)))

Folds=8

fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=123)
i = 1

for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    m2 = xgb.XGBClassifier(params=params,verbose=-1)
    
    m2.fit(x_train, y_train,eval_set=[(x_val, y_val)], early_stopping_rounds=50,verbose=1000,eval_metric='auc')
    
    pred_y = m2.predict_proba(x_val)[:,1]
    oofs[test_index] = pred_y
    print(i, " err_xgb: ", roc_auc_score(y_val,pred_y))
    err.append(roc_auc_score(y_val,pred_y))
    preds+= m2.predict_proba(x_test)[:,1]
    i = i + 1
preds=preds/Folds
    
print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
oof_score = roc_auc_score(y, oofs)
print(f'\nOOF Auc is : {oof_score}')
    
oofs1=pd.DataFrame(oofs,columns=['xgboof'])
preds1=pd.DataFrame(preds,columns=['xgbpred'])
[0]	validation_0-auc:0.844604
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.870014
1  err_xgb:  0.8700469589009449
[0]	validation_0-auc:0.84771
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.872639
2  err_xgb:  0.872638617035647
[0]	validation_0-auc:0.85112
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.875062
3  err_xgb:  0.8750643748867238
[0]	validation_0-auc:0.846325
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.871705
4  err_xgb:  0.8717049248970394
[0]	validation_0-auc:0.846967
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.869221
5  err_xgb:  0.869220532856176
[0]	validation_0-auc:0.853137
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.87717
6  err_xgb:  0.8771697377562315
[0]	validation_0-auc:0.849355
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.874683
7  err_xgb:  0.8746829579446593
[0]	validation_0-auc:0.849793
Will train until validation_0-auc hasn't improved in 50 rounds.
[99]	validation_0-auc:0.874644
8  err_xgb:  0.8746435680530732
Average StratifiedKFold Score : 0.8731464590413118 

OOF Auc is : 0.8730351248234431
In [45]:
submission1=pd.read_csv('sample_submission_eyYijxG.csv')
In [46]:
submission1['Is_Lead']=preds1['xgbpred']
In [47]:
submission1.shape
Out[47]:
(105312, 2)
In [48]:
submission1.to_csv('xgb_preds_30May_final.csv',index=False)

CATBOOST WITH STRATIFIED K FOLD

MODEL-3

In [49]:
err = [] 

oofs = np.zeros(shape=(len(x)))
preds = np.zeros(shape=(len(x_test)))

Folds=8

fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2020)
i = 1

for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    m3 = CatBoostClassifier(iterations=50,learning_rate=0.40013,depth=5,border_count=101,bagging_temperature=0.086008,
                            l2_leaf_reg=6,random_strength=4.4552e-05,
                            scale_pos_weight=0.58518,verbose= -1,nan_mode='Min',cat_features=cat_cols)
    
    m3.fit(x_train, y_train,eval_set=[(x_val, y_val)],verbose=False)
    
    pred_y = m3.predict_proba(x_val)[:,1]
    oofs[test_index] = pred_y
    print(i, " err_lgm: ", roc_auc_score(y_val,pred_y))
    err.append(roc_auc_score(y_val,pred_y))
    preds+= m3.predict_proba(x_test)[:,1]
    i = i + 1
preds=preds/Folds
    
print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
oof_score = roc_auc_score(y, oofs)
print(f'\nOOF Auc is : {oof_score}')
    
oofs=pd.DataFrame(oofs,columns=['cbmoof'])
preds3=pd.DataFrame(preds,columns=['cbmpred'])
1  err_lgm:  0.8736799530996777
2  err_lgm:  0.8737649974243016
3  err_lgm:  0.8721937891751309
4  err_lgm:  0.8723026866813137
5  err_lgm:  0.8757431625077661
6  err_lgm:  0.8743312754555856
7  err_lgm:  0.8732886120203116
8  err_lgm:  0.8735101801604832
Average StratifiedKFold Score : 0.8736018320655713 

OOF Auc is : 0.873528303890563
In [50]:
submission2=pd.read_csv('sample_submission_eyYijxG.csv')
In [51]:
submission2['Is_Lead']=preds3['cbmpred']
In [52]:
submission2.shape
Out[52]:
(105312, 2)
In [53]:
submission2.to_csv('cb_preds_30May_final.csv',index=False)

FEATURE IMPORTANCES


LGBM

In [54]:
import pandas as pd

def get_lgbm_varimp(model, train_columns, max_vars=50):
    
    if "basic.Booster" in str(model.__class__):
        # lightgbm.basic.Booster was trained directly, so using feature_importance() function 
        cv_varimp_df = pd.DataFrame([train_columns, model.feature_importance()]).T
    else:
        # Scikit-learn API LGBMClassifier or LGBMRegressor was fitted, 
        # so using feature_importances_ property
        cv_varimp_df = pd.DataFrame([train_columns, model.feature_importances_]).T

    cv_varimp_df.columns = ['feature_name', 'varimp']

    cv_varimp_df.sort_values(by='varimp', ascending=False, inplace=True)

    cv_varimp_df = cv_varimp_df.iloc[0:max_vars]   

    return cv_varimp_df
In [55]:
get_lgbm_varimp(m,x.columns,max_vars=25)
Out[55]:
feature_name varimp
1 Age 353
5 Vintage 324
7 Avg_Account_Balance 148
77 region_occupation_aggregate_featuresr_e_g_i_o_... 115
6 Credit_Product 77
80 region_occupation_aggregate_featuresr_e_g_i_o_... 57
85 region_occupation_aggregate_featuresr_e_g_i_o_... 48
116 grpd_by_Active_OccupationAvg_Account_Balance_min 37
60 region_channel_aggregate_featuresVintage_std 36
118 grpd_by_Active_CreditAvg_Account_Balance_mean 34
55 region_channel_aggregate_featuresAge_std 30
82 region_occupation_aggregate_featuresr_e_g_i_o_... 27
131 grpd_by_Region_CreditAvg_Account_Balance_max 27
57 region_channel_aggregate_featuresVintage_mean 27
0 Gender 27
136 grpd_by_Channel_CreditAvg_Account_Balance_min 26
132 grpd_by_Region_CreditAvg_Account_Balance_min 24
63 region_channel_aggregate_featuresAvg_Account_B... 23
112 grpd_by_Credit_OccupationAvg_Account_Balance_min 22
124 grpd_by_Active_RegionAvg_Account_Balance_min 22
3 Occupation 21
130 grpd_by_Region_CreditAvg_Account_Balance_mean 21
56 region_channel_aggregate_featuresAge_sum 18
110 grpd_by_Credit_OccupationAvg_Account_Balance_mean 18
128 grpd_by_Active_ChannelAvg_Account_Balance_min 18
In [70]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def plotImp(model):
    feature_imp = get_lgbm_varimp(m,x.columns,max_vars=20)
    plt.figure(figsize=(40, 20))
    sns.set(font_scale = 5)
    sns.barplot(x="varimp", y="feature_name", data=feature_imp.sort_values(by="varimp", 
                                                        ascending=False)[0:10])
    plt.title('LGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances-01.png')
    plt.show()
In [71]:
plotImp(m)

FEATURE IMPORTANCES


XGBOOST

In [72]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def plotImp(model):
    feature_imp = get_lgbm_varimp(m2,x.columns,max_vars=20)
    plt.figure(figsize=(40, 20))
    sns.set(font_scale = 5)
    sns.barplot(x="varimp", y="feature_name", data=feature_imp.sort_values(by="varimp", 
                                                        ascending=False)[0:10])
    plt.title('XGB Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('XGB_importances.png')
    plt.show()
In [73]:
get_lgbm_varimp(m2,x.columns,max_vars=25)
Out[73]:
feature_name varimp
118 grpd_by_Active_CreditAvg_Account_Balance_mean 0.378127
136 grpd_by_Channel_CreditAvg_Account_Balance_min 0.253317
120 grpd_by_Active_CreditAvg_Account_Balance_min 0.111547
6 Credit_Product 0.083496
77 region_occupation_aggregate_featuresr_e_g_i_o_... 0.0220246
116 grpd_by_Active_OccupationAvg_Account_Balance_min 0.0188965
85 region_occupation_aggregate_featuresr_e_g_i_o_... 0.0139707
134 grpd_by_Channel_CreditAvg_Account_Balance_mean 0.0118983
5 Vintage 0.0113675
1 Age 0.00713285
110 grpd_by_Credit_OccupationAvg_Account_Balance_mean 0.00653856
112 grpd_by_Credit_OccupationAvg_Account_Balance_min 0.00652008
57 region_channel_aggregate_featuresVintage_mean 0.00574121
90 region_occupation_aggregate_featuresr_e_g_i_o_... 0.00515842
113 grpd_by_Credit_OccupationAvg_Account_Balance_std 0.00405439
108 grpd_by_Channel_Code_OccupationAvg_Account_Bal... 0.00395794
111 grpd_by_Credit_OccupationAvg_Account_Balance_max 0.00366058
109 grpd_by_Channel_Code_OccupationAvg_Account_Bal... 0.00362319
107 grpd_by_Channel_Code_OccupationAvg_Account_Bal... 0.00322657
135 grpd_by_Channel_CreditAvg_Account_Balance_max 0.0031135
3 Occupation 0.00307701
78 region_occupation_aggregate_featuresr_e_g_i_o_... 0.00295188
80 region_occupation_aggregate_featuresr_e_g_i_o_... 0.00279743
126 grpd_by_Active_ChannelAvg_Account_Balance_mean 0.0021657
119 grpd_by_Active_CreditAvg_Account_Balance_max 0.00215968
In [74]:
plotImp(m2)

FEATURE IMPORTANCES


CATBOOST SHAP VALUES

In [61]:
shap.initjs()
In [62]:
explainer = shap.TreeExplainer(m3)
In [63]:
categorical_features_indices = np.where(x_train.columns.isin(cat_cols))[0]
categorical_features_indices
Out[63]:
array([ 0,  2,  3,  4,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23])
In [64]:
shap_values = explainer.shap_values(Pool(x_train, y_train,cat_features=categorical_features_indices))
In [65]:
# summarize the effects of all the features
shap.summary_plot(shap_values,x_train)
In [69]:
shap.initjs()
# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values[0:500,:], x_train.iloc[0:500,:])
Out[69]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [76]:
# feature importance plot
shap.summary_plot(shap_values, x_train, plot_type="bar",color='rgb')
In [68]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[4,:], x_train.iloc[4,:])
Out[68]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.